#!/usr/bin/python
# coding=utf-8
# create :maizai1994 on 2018/8/7

import time
import random
import json
import requests

#  URL = http://m.maoyan.com/mmdb/comments/movie/341516.json?_v_=yes&offset=1
# Download one page of data
def get_one_page(url, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection error, timeout, ...).
    """
    # Fixed request headers, including a desktop Chrome User-Agent and a
    # Baidu Referer.
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'http://www.baidu.com/',
    }

    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.text
    return None


# Parse one page of data
def parse_one_page(html):
    """Yield one flat record per comment found in a JSON response body.

    Args:
        html: JSON text whose top-level object stores the list of comments
            under the ``"cmts"`` key. ``None`` or an empty string (e.g. a
            failed download) is accepted and produces no records.

    Yields:
        dict: A record with the keys ``"comment"``, ``"date"``, ``"rate"``,
        ``"city"`` and ``"nickname"``, taken from the comment's
        ``"content"``, ``"time"``, ``"score"``, ``"cityName"`` and
        ``"nickName"`` fields respectively.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
        KeyError: If a comment lacks one of the fields listed above.
    """
    # Nothing was downloaded, so there is nothing to parse.
    if not html:
        return

    # A missing or null "cmts" entry is treated as "no comments".
    comments = json.loads(html).get("cmts") or []

    for item in comments:
        yield {
            "comment": item["content"],
            # Keep only the part of "time" that precedes the first space.
            "date": item["time"].split(' ')[0],
            "rate": item["score"],
            "city": item["cityName"],
            "nickname": item["nickName"],
        }

# Save to a text file
def save_to_txt():
    """Download the comment pages and append every comment to ``狄仁杰.txt``.

    Requests the comment URL with ``offset`` values 1 through 1000. Each
    comment becomes one line of the form ``date,nickname,rate,city,comment``
    appended to the UTF-8 file ``狄仁杰.txt`` in the working directory.

    A page that cannot be downloaded (network error or non-200 answer) is
    reported and skipped instead of aborting the whole run.
    """
    for i in range(1, 1001):
        url = "http://m.maoyan.com/mmdb/comments/movie/341516.json?_v_=yes&offset=" + str(i)
        try:
            html = get_one_page(url)
        except requests.RequestException:
            # One failed request should not end the whole crawl.
            html = None
        print("正在保存第" + str(i) + "页")

        if html is None:
            print("第" + str(i) + "页下载失败，已跳过")
        else:
            lines = [
                item["date"] + ',' + item["nickname"] + ','
                + str(item["rate"]) + ',' + item["city"] + "," + item["comment"] + '\n'
                for item in parse_one_page(html)
            ]
            # Open the output once per page rather than once per comment, and
            # only when there is something to write.
            if lines:
                with open("狄仁杰.txt", 'a', encoding="utf-8") as f:
                    f.writelines(lines)

        # Anti-crawling measure: pause between 5.05 and 10 seconds (random)
        # before the next request, even after a failed page.
        time.sleep(5 + float(random.randint(1, 100)) / 20)

# Remove duplicated lines of text
def delete_repeat(old, new):
    """Copy ``old`` to ``new`` keeping only the first occurrence of each line.

    Lines are compared exactly, including their trailing newline, and the
    order of the surviving lines is preserved. Both files are read and
    written as UTF-8.

    Args:
        old: Path of the file to read.
        new: Path of the file to write; it is created or overwritten. It may
            be the same path as ``old`` because the input is read completely
            before the output is opened.
    """
    with open(old, "r", encoding="utf-8") as src:
        lines = src.readlines()

    # A set gives constant-time membership tests for lines already written.
    seen = set()
    with open(new, "w", encoding="utf-8") as dst:
        for line in lines:
            if line not in seen:
                seen.add(line)
                dst.write(line)


if __name__ == "__main__":
    # Script entry point: crawl the comments first, then write a copy of the
    # crawl output with repeated lines removed.
    save_to_txt()
    delete_repeat(old=r"狄仁杰.txt", new=r"狄仁杰_new.txt")
